Load the tweets and check that they are loaded correctly. We also inspect the summary for a first interpretation. The `summary(tweets)` output reveals the following:
# Working-directory helpers kept commented out for reference; the load()
# call below uses a relative path instead (NOTE(review): setwd() in scripts
# is discouraged anyway)
# getwd()
# setwd("./data/")
# Load the tweet data; load() restores the saved object (used below as
# `tweets`) into the global environment
load("../data/Tweets_all.rda")
# Check that tweets are loaded by inspecting the first rows
head(tweets)
## # A tibble: 6 × 14
## created_at id id_str full_text in_reply_to_screen_n…¹
## <dttm> <dbl> <chr> <chr> <chr>
## 1 2023-01-20 17:17:32 1.62e18 1616469988369469… "Im MSc … <NA>
## 2 2023-01-13 07:52:01 1.61e18 1613790954737074… "Was bew… <NA>
## 3 2023-01-12 19:30:01 1.61e18 1613604227141537… "Was uns… <NA>
## 4 2023-01-12 08:23:00 1.61e18 1613436367169634… "Eine di… <NA>
## 5 2023-01-11 14:00:05 1.61e18 1613158809081450… "Wir gra… <NA>
## 6 2023-01-10 17:06:11 1.61e18 1612843252083834… "Unsere … <NA>
## # ℹ abbreviated name: ¹in_reply_to_screen_name
## # ℹ 9 more variables: retweet_count <int>, favorite_count <int>, lang <chr>,
## # university <chr>, tweet_date <dttm>, tweet_minute <dttm>,
## # tweet_hour <dttm>, tweet_month <date>, timeofday_hour <chr>
# Column-wise summary for a first look at value ranges and column types
summary(tweets)
## created_at id id_str
## Min. :2009-09-29 14:29:47.0 Min. :4.469e+09 Length:19575
## 1st Qu.:2015-01-28 15:07:41.5 1st Qu.:5.604e+17 Class :character
## Median :2018-04-13 13:26:56.0 Median :9.848e+17 Mode :character
## Mean :2017-12-09 15:26:50.7 Mean :9.400e+17
## 3rd Qu.:2020-10-20 10:34:50.0 3rd Qu.:1.318e+18
## Max. :2023-01-26 14:49:31.0 Max. :1.619e+18
## full_text in_reply_to_screen_name retweet_count favorite_count
## Length:19575 Length:19575 Min. : 0.000 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.000 1st Qu.: 0.00
## Mode :character Mode :character Median : 1.000 Median : 0.00
## Mean : 1.289 Mean : 1.37
## 3rd Qu.: 2.000 3rd Qu.: 2.00
## Max. :267.000 Max. :188.00
## lang university tweet_date
## Length:19575 Length:19575 Min. :2009-09-29 00:00:00.00
## Class :character Class :character 1st Qu.:2015-01-28 00:00:00.00
## Mode :character Mode :character Median :2018-04-13 00:00:00.00
## Mean :2017-12-09 02:25:45.00
## 3rd Qu.:2020-10-20 00:00:00.00
## Max. :2023-01-26 00:00:00.00
## tweet_minute tweet_hour
## Min. :2009-09-29 14:29:00.00 Min. :2009-09-29 14:00:00.00
## 1st Qu.:2015-01-28 15:07:00.00 1st Qu.:2015-01-28 14:30:00.00
## Median :2018-04-13 13:26:00.00 Median :2018-04-13 13:00:00.00
## Mean :2017-12-09 15:26:24.68 Mean :2017-12-09 14:59:43.81
## 3rd Qu.:2020-10-20 10:34:30.00 3rd Qu.:2020-10-20 10:00:00.00
## Max. :2023-01-26 14:49:00.00 Max. :2023-01-26 14:00:00.00
## tweet_month timeofday_hour
## Min. :2009-09-01 Length:19575
## 1st Qu.:2015-01-01 Class :character
## Median :2018-04-01 Mode :character
## Mean :2017-11-24
## 3rd Qu.:2020-10-01
## Max. :2023-01-01
Start preprocessing the tweets; to calculate the intervals between tweets, some additional properties are needed. The preprocessing steps transform the raw tweet data into a structured format suitable for analysis, including date and time conversion, language detection, and emoji extraction.
# Preprocessing: derive calendar fields (date, weekday, year, month) from the
# tweet timestamp, normalise the university label, detect the tweet language,
# and replace emojis with their textual names for later text mining.
# Weekdays start on Monday and are labelled using the system locale.
tweets <- tweets %>%
  mutate(
    # created_at is already POSIXct (see the summary above), so this
    # re-parse is effectively a no-op safeguard for character input
    created_at = as.POSIXct(created_at, format = "%Y-%m-%d %H:%M:%S"),
    date = as.Date(created_at),
    # Full (unabbreviated) weekday name; week_start = 1 makes Monday the
    # first level, labels follow the LC_TIME locale
    day = lubridate::wday(created_at,
      label = TRUE, abbr = FALSE,
      week_start = getOption("lubridate.week.start", 1),
      locale = Sys.getlocale("LC_TIME")
    ),
    year = year(created_at),
    month = floor_date(created_at, "month"),
    university = as.character(university),
    # NOTE(review): this overwrites the original `lang` column with the
    # detected language of the tweet text — confirm that is intended
    lang = detect_language(full_text),
    # Replace emojis with their names; replace_emoji() also leaves byte-style
    # tags (e.g. "<f0>") in the text, which are stripped below
    full_text_emojis = replace_emoji(full_text, emoji_dt = lexicon::hash_emojis)
  )
# Helper: strip the residual two-character byte tags (e.g. "<e2>", "<9f>")
# that replace_emoji() leaves in the text alongside the emoji names.
remove_emoji_tags <- function(text) {
  gsub("<[a-z0-9]{2}>", "", text)
}
# Remove the emoji byte tags from the tweet texts.
# Fix: str_remove_all() (inside remove_emoji_tags) is already vectorised over
# character vectors, so the original sapply() wrapper was redundant and also
# attached the full tweet texts as names on the result.
tweets$full_text_emojis <- remove_emoji_tags(tweets$full_text_emojis)
# Store the extracted emojis in a separate column to analyze later.
# NOTE(review): the right-hand side is a one-column tibble, so this creates a
# data-frame column rather than a plain character/list column — confirm that
# downstream code expects this.
tweets$emoji_unicode <- tweets %>%
  emoji_extract_nest(full_text) %>%
  select(.emoji_unicode)
# Number of tweets per university for each hour of the day
tweet_counts_by_hour_of_day <- tweets %>%
  group_by(university, timeofday_hour) %>%
  tally() %>%
  arrange(university, timeofday_hour)
# Line chart of the hourly counts, one panel per university
hourly_plot <- ggplot(
  tweet_counts_by_hour_of_day,
  aes(
    x = timeofday_hour, y = n,
    color = university, group = university
  )
)
hourly_plot +
  geom_line() +
  facet_wrap(~university) +
  labs(
    title = "Number of tweets by university and hour",
    x = "Hour of day",
    y = "Number of tweets"
  )
# Most active hour for each university (ties would all be kept by slice_max).
# Fix: the original re-grouped and summed `n` per (university, hour), but
# tweet_counts_by_hour_of_day already has exactly one row per combination, so
# that aggregation was a no-op — renaming the count column suffices.
hours_with_most_tweets_by_uni <- tweet_counts_by_hour_of_day %>%
  ungroup() %>%
  rename(total_tweets = n) %>%
  group_by(university) %>%
  slice_max(order_by = total_tweets, n = 1)
print(hours_with_most_tweets_by_uni)
## # A tibble: 8 × 3
## # Groups: university [8]
## university timeofday_hour total_tweets
## <chr> <chr> <int>
## 1 FHNW 09 344
## 2 FH_Graubuenden 11 493
## 3 ZHAW 17 580
## 4 bfh 08 497
## 5 hes_so 10 315
## 6 hslu 09 380
## 7 ost_fh 08 44
## 8 supsi_ch 11 330
# Most active hour across all universities combined.
# Fix: the arrange(desc(total_tweets)) step was redundant — slice_max()
# already selects (and returns) the maximum row(s).
hour_with_most_tweets <- tweet_counts_by_hour_of_day %>%
  group_by(timeofday_hour) %>%
  summarize(total_tweets = sum(n)) %>%
  slice_max(order_by = total_tweets, n = 1)
print(hour_with_most_tweets)
## # A tibble: 1 × 2
## timeofday_hour total_tweets
## <chr> <int>
## 1 11 2356
# Number of tweets per university for each day of the week
tweet_counts_by_week_day <- tweets %>%
  group_by(university, day) %>%
  tally() %>%
  arrange(university, day)
# Line chart of the weekday counts, one panel per university
weekday_plot <- ggplot(
  tweet_counts_by_week_day,
  aes(
    x = day, y = n,
    color = university,
    group = university
  )
)
weekday_plot +
  geom_line() +
  facet_wrap(~university) +
  labs(
    title = "Number of tweets by university and day of the week",
    x = "Day of the week",
    y = "Number of tweets"
  )
# Most active weekday for each university.
# Fix: as with the hourly version, the input already has one row per
# (university, day), so the original sum() re-aggregation was a no-op —
# renaming the count column is sufficient.
days_with_most_tweets_by_uni <- tweet_counts_by_week_day %>%
  ungroup() %>%
  rename(total_tweets = n) %>%
  group_by(university) %>%
  slice_max(order_by = total_tweets, n = 1)
print(days_with_most_tweets_by_uni)
## # A tibble: 8 × 3
## # Groups: university [8]
## university day total_tweets
## <chr> <ord> <int>
## 1 FHNW Tuesday 575
## 2 FH_Graubuenden Tuesday 751
## 3 ZHAW Wednesday 636
## 4 bfh Tuesday 651
## 5 hes_so Tuesday 415
## 6 hslu Thursday 603
## 7 ost_fh Friday 65
## 8 supsi_ch Friday 461
# Helper for the interval analysis below: returns the most frequent value in
# `x` (the statistical mode). Ties are broken by first occurrence; an empty
# input yields an empty vector. By default NA counts as a regular value
# (preserving the original behaviour, which can return NA as the mode); set
# na.rm = TRUE to ignore missing values.
find_mode <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}
# Time gap (in minutes) to the previous tweet of the same university; the
# first tweet of each university has no predecessor, hence NA (8 NAs total,
# one per university, see summary below).
# Fix: added ungroup() — the original left `tweets` grouped by university for
# the whole remainder of the script, which can silently change the behaviour
# of later dplyr verbs.
tweets <- tweets %>%
  arrange(university, created_at) %>%
  group_by(university) %>%
  mutate(time_interval = as.numeric(
    difftime(created_at, lag(created_at), units = "mins")
  )) %>%
  ungroup()
# Descriptive statistics of time intervals
summary(tweets$time_interval)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 148.2 1128.8 2097.6 2428.3 220707.0 8
# setwd("../4.Text-Mining-Groupwork/plots")
# Years present in the data, in order of first appearance — NOTE(review):
# not sorted, so the plots below are emitted in data order, not
# chronologically; consider sort(unique_years)
unique_years <- tweets$year %>% unique()
# Plot the distribution of time intervals between tweets for each year,
# overall (facetted by university) and per university, and report the most
# common interval per university/year
for (curr_year in unique_years) {
  # Filter data for the specific year (equivalent to filtering on the
  # precomputed `year` column)
  filtered_data <- tweets %>%
    filter(year(created_at) == curr_year)
  # print() is required here: ggplot objects do not auto-print inside a loop
  print(ggplot(filtered_data, aes(x = time_interval)) +
    geom_histogram(fill = "lightblue") +
    facet_wrap(~university) +
    labs(
      title = paste0(
        "Distribution of time intervals between tweets - ", curr_year
      ),
      x = "Time interval (minutes)",
      y = "Tweet count"
    ))
  # Universities that tweeted in this year
  universities <- filtered_data$university %>% unique()
  for (uni in universities) {
    # Filter data for the specific university
    uni_filtered_data <- filtered_data %>%
      filter(university == uni)
    print(ggplot(uni_filtered_data, aes(x = time_interval)) +
      geom_histogram(fill = "lightblue") +
      labs(
        title = paste0(
          "Distribution of time intervals between tweets for ", uni,
          " in ", curr_year
        ),
        x = "Time interval (minutes)",
        y = "Tweet count"
      ))
    # Most common interval, converted from minutes to hours. Returns NA when
    # NA is the most frequent value (e.g. a single tweet in that year).
    # NOTE(review): the mode of a continuous variable is fragile — most
    # interval values occur exactly once; consider binning/rounding first.
    most_common_interval_minutes <- find_mode(uni_filtered_data$time_interval)
    most_common_interval_hours <- most_common_interval_minutes / 60
    print(paste0(
      "Most common time interval for ", uni,
      " in ",
      curr_year,
      " is ", most_common_interval_minutes,
      " minutes (", most_common_interval_hours, " hours)"
    ))
  }
}
## [1] "Most common time interval for FHNW in 2011 is NA minutes (NA hours)"
## [1] "Most common time interval for FH_Graubuenden in 2011 is 23210.3 minutes (386.838333333333 hours)"
## [1] "Most common time interval for hes_so in 2011 is 1.55 minutes (0.0258333333333333 hours)"
## [1] "Most common time interval for FHNW in 2012 is 17324.65 minutes (288.744166666667 hours)"
## [1] "Most common time interval for FH_Graubuenden in 2012 is 0.9 minutes (0.015 hours)"
## [1] "Most common time interval for ZHAW in 2012 is NA minutes (NA hours)"
## [1] "Most common time interval for bfh in 2012 is NA minutes (NA hours)"
## [1] "Most common time interval for hes_so in 2012 is 22086.35 minutes (368.105833333333 hours)"
## [1] "Most common time interval for FHNW in 2013 is 1.26666666666667 minutes (0.0211111111111111 hours)"
## [1] "Most common time interval for FH_Graubuenden in 2013 is 21879.45 minutes (364.6575 hours)"
## [1] "Most common time interval for ZHAW in 2013 is 0.583333333333333 minutes (0.00972222222222222 hours)"
## [1] "Most common time interval for bfh in 2013 is 65.0833333333333 minutes (1.08472222222222 hours)"
## [1] "Most common time interval for hes_so in 2013 is 36252.5833333333 minutes (604.209722222222 hours)"
## [1] "Most common time interval for supsi_ch in 2013 is 0.783333333333333 minutes (0.0130555555555556 hours)"
## [1] "Most common time interval for FHNW in 2014 is 4.58333333333333 minutes (0.0763888888888889 hours)"
## [1] "Most common time interval for FH_Graubuenden in 2014 is 0.183333333333333 minutes (0.00305555555555556 hours)"
## [1] "Most common time interval for ZHAW in 2014 is 0.05 minutes (0.000833333333333333 hours)"
## [1] "Most common time interval for bfh in 2014 is 153.35 minutes (2.55583333333333 hours)"
## [1] "Most common time interval for hes_so in 2014 is 21986.6 minutes (366.443333333333 hours)"
## [1] "Most common time interval for supsi_ch in 2014 is 37496.4833333333 minutes (624.941388888889 hours)"
## [1] "Most common time interval for FHNW in 2015 is 48918.3 minutes (815.305 hours)"
## [1] "Most common time interval for FH_Graubuenden in 2015 is 1139.9 minutes (18.9983333333333 hours)"
## [1] "Most common time interval for ZHAW in 2015 is 0.316666666666667 minutes (0.00527777777777778 hours)"
## [1] "Most common time interval for bfh in 2015 is 20272.0333333333 minutes (337.867222222222 hours)"
## [1] "Most common time interval for hes_so in 2015 is 0.166666666666667 minutes (0.00277777777777778 hours)"
## [1] "Most common time interval for supsi_ch in 2015 is 43496.6333333333 minutes (724.943888888889 hours)"
## [1] "Most common time interval for FHNW in 2016 is 34708.6666666667 minutes (578.477777777778 hours)"
## [1] "Most common time interval for FH_Graubuenden in 2016 is 240.05 minutes (4.00083333333333 hours)"
## [1] "Most common time interval for ZHAW in 2016 is 21.2 minutes (0.353333333333333 hours)"
## [1] "Most common time interval for bfh in 2016 is 0.0833333333333333 minutes (0.00138888888888889 hours)"
## [1] "Most common time interval for hes_so in 2016 is 2.7 minutes (0.045 hours)"
## [1] "Most common time interval for hslu in 2016 is NA minutes (NA hours)"
## [1] "Most common time interval for supsi_ch in 2016 is 1.58333333333333 minutes (0.0263888888888889 hours)"
## [1] "Most common time interval for FHNW in 2017 is 48748.5333333333 minutes (812.475555555556 hours)"
## [1] "Most common time interval for FH_Graubuenden in 2017 is 5617.83333333333 minutes (93.6305555555555 hours)"
## [1] "Most common time interval for ZHAW in 2017 is 6954.43333333333 minutes (115.907222222222 hours)"
## [1] "Most common time interval for bfh in 2017 is 18606.6666666667 minutes (310.111111111111 hours)"
## [1] "Most common time interval for hes_so in 2017 is 71909.9833333333 minutes (1198.49972222222 hours)"
## [1] "Most common time interval for hslu in 2017 is 0.266666666666667 minutes (0.00444444444444444 hours)"
## [1] "Most common time interval for supsi_ch in 2017 is 1.36666666666667 minutes (0.0227777777777778 hours)"
## [1] "Most common time interval for FHNW in 2018 is 0.166666666666667 minutes (0.00277777777777778 hours)"
## [1] "Most common time interval for FH_Graubuenden in 2018 is 1446.23333333333 minutes (24.1038888888889 hours)"
## [1] "Most common time interval for ZHAW in 2018 is 5689.93333333333 minutes (94.8322222222222 hours)"
## [1] "Most common time interval for bfh in 2018 is 20172.05 minutes (336.200833333333 hours)"
## [1] "Most common time interval for hes_so in 2018 is 31170.8333333333 minutes (519.513888888889 hours)"
## [1] "Most common time interval for hslu in 2018 is 0.233333333333333 minutes (0.00388888888888889 hours)"
## [1] "Most common time interval for supsi_ch in 2018 is 0.183333333333333 minutes (0.00305555555555556 hours)"
## [1] "Most common time interval for FHNW in 2019 is 315.233333333333 minutes (5.25388888888889 hours)"
## [1] "Most common time interval for FH_Graubuenden in 2019 is 10079.85 minutes (167.9975 hours)"
## [1] "Most common time interval for ZHAW in 2019 is 1255.61666666667 minutes (20.9269444444444 hours)"
## [1] "Most common time interval for bfh in 2019 is 1440.05 minutes (24.0008333333333 hours)"
## [1] "Most common time interval for hes_so in 2019 is 1140.03333333333 minutes (19.0005555555556 hours)"
## [1] "Most common time interval for hslu in 2019 is 1.95 minutes (0.0325 hours)"
## [1] "Most common time interval for supsi_ch in 2019 is 15 minutes (0.25 hours)"
## [1] "Most common time interval for FHNW in 2020 is 3180.16666666667 minutes (53.0027777777778 hours)"
## [1] "Most common time interval for FH_Graubuenden in 2020 is 2880.03333333333 minutes (48.0005555555556 hours)"
## [1] "Most common time interval for ZHAW in 2020 is 13693.7666666667 minutes (228.229444444444 hours)"
## [1] "Most common time interval for bfh in 2020 is 14531.7333333333 minutes (242.195555555556 hours)"
## [1] "Most common time interval for hes_so in 2020 is 1139.91666666667 minutes (18.9986111111111 hours)"
## [1] "Most common time interval for hslu in 2020 is 120 minutes (2 hours)"
## [1] "Most common time interval for ost_fh in 2020 is NA minutes (NA hours)"
## [1] "Most common time interval for supsi_ch in 2020 is 0.133333333333333 minutes (0.00222222222222222 hours)"
## [1] "Most common time interval for FHNW in 2021 is 0.5 minutes (0.00833333333333333 hours)"
## [1] "Most common time interval for FH_Graubuenden in 2021 is 0.333333333333333 minutes (0.00555555555555555 hours)"
## [1] "Most common time interval for ZHAW in 2021 is 13043.9833333333 minutes (217.399722222222 hours)"
## [1] "Most common time interval for bfh in 2021 is 1411.05 minutes (23.5175 hours)"
## [1] "Most common time interval for hes_so in 2021 is 0 minutes (0 hours)"
## [1] "Most common time interval for hslu in 2021 is 0 minutes (0 hours)"
## [1] "Most common time interval for ost_fh in 2021 is 0.35 minutes (0.00583333333333333 hours)"
## [1] "Most common time interval for supsi_ch in 2021 is 1140 minutes (19 hours)"
## [1] "Most common time interval for FHNW in 2022 is 1439.93333333333 minutes (23.9988888888889 hours)"
## [1] "Most common time interval for FH_Graubuenden in 2022 is 0.1 minutes (0.00166666666666667 hours)"
## [1] "Most common time interval for ZHAW in 2022 is 18623.7166666667 minutes (310.395277777778 hours)"
## [1] "Most common time interval for bfh in 2022 is 7192.96666666667 minutes (119.882777777778 hours)"
## [1] "Most common time interval for hes_so in 2022 is 5798.53333333333 minutes (96.6422222222222 hours)"
## [1] "Most common time interval for hslu in 2022 is 0 minutes (0 hours)"
## [1] "Most common time interval for ost_fh in 2022 is 0.133333333333333 minutes (0.00222222222222222 hours)"
## [1] "Most common time interval for supsi_ch in 2022 is 28800.7333333333 minutes (480.012222222222 hours)"
## [1] "Most common time interval for FHNW in 2023 is 9997.63333333333 minutes (166.627222222222 hours)"
## [1] "Most common time interval for FH_Graubuenden in 2023 is 21962.3833333333 minutes (366.039722222222 hours)"
## [1] "Most common time interval for ZHAW in 2023 is 70740.3333333333 minutes (1179.00555555556 hours)"
## [1] "Most common time interval for bfh in 2023 is 8000.11666666667 minutes (133.335277777778 hours)"
## [1] "Most common time interval for hes_so in 2023 is 4621.1 minutes (77.0183333333333 hours)"
## [1] "Most common time interval for hslu in 2023 is 627.083333333333 minutes (10.4513888888889 hours)"
## [1] "Most common time interval for supsi_ch in 2023 is 7199 minutes (119.983333333333 hours)"
## [1] "Most common time interval for FH_Graubuenden in 2009 is NA minutes (NA hours)"
## [1] "Most common time interval for FH_Graubuenden in 2010 is 55732.2833333333 minutes (928.871388888889 hours)"
## [1] "Most common time interval for hes_so in 2010 is NA minutes (NA hours)"
# Keep only tweets detected as one of the four analysis languages
langs <- c("de", "fr", "it", "en")
tweets_filtered <- tweets %>%
  filter(lang %in% langs)
# Extended stopwords shared by all analyses (defined once, outside the
# loops, for efficiency):
# - university handles/hashtags that would dominate the frequency counts
# - "amp": HTML-entity remnant of the "&" symbol, not a meaningful word
# - "rt": per the original author, a fragment of real words (e.g.
#   "engagiert") rather than only the retweet marker — NOTE(review): confirm
# - "fr", "ber": presumably encoding/stemming fragments — TODO confirm
extended_stopwords <- c(
  "#fhnw", "#bfh", "@htw_chur", "#hslu", "#supsi", "#sups",
  "amp", "rt", "fr", "ber"
)
# Build one document-feature matrix (DFM) per language so that stemming and
# stopword removal can be language-specific.
dfm_list <- list()
for (sel_lang in langs) {
  # Subset tweets for the current language
  tweets_lang <- tweets_filtered %>%
    filter(lang == sel_lang)
  # Language-specific stopword list
  stopwords_lang <- stopwords(sel_lang)
  # Tokenisation pipeline:
  # - corpus() first, because tokens() only accepts character, corpus, list,
  #   tokens, or tokens_xptr objects
  # - remove URLs, punctuation, numbers, symbols and separators
  # - lowercase, then stem with the language-specific stemmer
  # - remove stopwords AFTER stemming, because (per the original author)
  #   stemming shortens some words into forms that must still be matched
  # Fix: a redundant tokens_ngrams(n = 1) step was removed — with n = 1 it
  # returns the tokens unchanged; bigrams/trigrams are built later.
  tokens_lang <- tweets_lang %>%
    corpus(text_field = "full_text_emojis") %>%
    tokens(
      remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = TRUE,
      remove_url = TRUE, remove_separators = TRUE
    ) %>%
    tokens_tolower() %>%
    tokens_wordstem(lang = sel_lang) %>%
    tokens_select(
      pattern =
        c(stopwords_lang, extended_stopwords), selection = "remove"
    )
  # DFM for the current language
  dfm_list[[sel_lang]] <- dfm(tokens_lang)
}
# Word frequencies & visualisation, per language
# English: top terms and word cloud
words_freqs_en <- colSums(dfm_list$en) %>% sort(decreasing = TRUE)
head(words_freqs_en, 20)
## student new @hslu univers project thank
## 106 74 70 62 60 60
## @zhaw day scienc today innov now
## 59 56 54 52 51 50
## swiss switzerland @fhnw great us join
## 49 49 46 46 44 43
## studi research
## 42 42
en_cloud <- data.frame(word = names(words_freqs_en), freq = words_freqs_en)
wordcloud2(en_cloud, size = 0.5)
# German: top terms and word cloud
words_freqs_de <- colSums(dfm_list$de) %>% sort(decreasing = TRUE)
head(words_freqs_de, 20)
## neu mehr schweiz werd all studier heut hochschul
## 1586 1104 967 772 706 706 638 601
## bfh jahr knnen digital thema studi projekt welch
## 577 535 507 499 497 466 465 462
## bern statt zeigt arbeit
## 454 451 437 434
de_cloud <- data.frame(word = names(words_freqs_de), freq = words_freqs_de)
wordcloud2(de_cloud, size = 0.5)
# Italian: top terms and word cloud
word_freqs_it <- colSums(dfm_list$it) %>% sort(decreasing = TRUE)
head(word_freqs_it, 20)
## nuov sups progett student present info
## 210 208 173 146 143 143
## iscrizion cors ricerc formazion #supsinews #supsievent
## 142 141 135 134 134 129
## scopr inform diplom bachelor apert tutt
## 123 120 116 111 110 105
## master pi
## 103 102
it_cloud <- data.frame(word = names(word_freqs_it), freq = word_freqs_it)
wordcloud2(it_cloud, size = 0.5)
# Some English-looking terms show up below; these are presumably emoji names
# French: top terms and word cloud
word_freqs_fr <- colSums(dfm_list$fr) %>% sort(decreasing = TRUE)
head(word_freqs_fr, 20)
## hes-so right arrow dan projet a tudi haut
## 505 432 324 249 248 234 199 183
## col @hes_so @hessoval dcouvr book open recherch #hes_so
## 155 140 129 127 123 118 117 115
## suiss plus mast nouveau
## 110 105 103 98
fr_cloud <- data.frame(word = names(word_freqs_fr), freq = word_freqs_fr)
wordcloud2(fr_cloud, size = 0.5)
# University-specific analysis: combined-language token pipeline per
# university, printing the top terms and a word cloud for each.
for (uni in unique(tweets$university)) {
  # Subset the language-filtered tweets for the current university
  # (the original comment said "language" — copy/paste error)
  uni_tweets <- tweets_filtered %>%
    filter(university == uni)
  # NOTE(review): tokens_wordstem() without a language argument stems with
  # its default language even though these tweets mix de/fr/it/en — the
  # result is only approximate for non-default languages.
  # Fix: redundant tokens_ngrams(n = 1) removed (identity operation).
  tokens_lang <- uni_tweets %>%
    corpus(text_field = "full_text_emojis") %>%
    tokens(
      remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = TRUE,
      remove_url = TRUE, remove_separators = TRUE
    ) %>%
    tokens_tolower() %>%
    tokens_wordstem() %>%
    tokens_select(
      pattern =
        c(
          stopwords("en"), stopwords("de"),
          stopwords("fr"), stopwords("it"), extended_stopwords
        ), selection = "remove"
    )
  # Document-feature matrix for this university across all languages
  uni_dfm <- dfm(tokens_lang)
  # Word frequencies, most frequent first
  uni_word_freqs <- sort(colSums(uni_dfm), decreasing = TRUE)
  # Bug fix: inside a for loop results are not auto-printed, so the original
  # head() and wordcloud2() calls were silently discarded — wrap in print().
  # (The "right" emoji token is among the most common words.)
  print(head(uni_word_freqs, 20))
  print(wordcloud2(data.frame(
    word = names(uni_word_freqs),
    freq = uni_word_freqs
  ), size = 0.5))
}
# Weighted engagement metric: retweets count double compared to favorites
tweets <- tweets %>%
  mutate(weighted_engagement = retweet_count * 2 + favorite_count)
# Top 1000 tweets by weighted engagement, kept for the following analyses
most_engaged_tweets <- tweets %>%
  arrange(desc(weighted_engagement)) %>%
  head(1000)
# At what hour of the day were the most engaged tweets posted?
most_engaged_tweets_time <- most_engaged_tweets %>%
  mutate(time_of_day = format(created_at, "%H"))
ggplot(most_engaged_tweets_time, aes(x = as.numeric(time_of_day))) +
  geom_histogram(binwidth = 1, color = "blue", fill = "lightblue") +
  labs(
    title = "Distribution of Posting Times for Most Engaged Tweets",
    x = "Hour of Day",
    y = "Frequency"
  )
Analyse the content of the most engaged tweets (the top 1000 by weighted engagement).
# Preprocess the content of the most engaged tweets with the same pipeline
# as above.
tokens_most_engaged <- most_engaged_tweets %>%
  corpus(text_field = "full_text_emojis") %>%
  tokens(
    remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = TRUE,
    remove_url = TRUE, remove_separators = TRUE
  ) %>%
  tokens_tolower() %>%
  # Bug fix: the original passed `lang = sel_lang`, a stale loop variable
  # left over from the per-language DFM loop (it happened to hold "en", the
  # last element of `langs`). The language is now explicit. Note this
  # selection is multilingual, so a single stemmer is only an approximation.
  tokens_wordstem(language = "en") %>%
  tokens_select(
    pattern =
      c(
        stopwords("en"), stopwords("de"),
        stopwords("fr"), stopwords("it"), extended_stopwords
      ), selection = "remove"
  )
# DFM and sorted term frequencies for the most engaged tweets
tokens_most_engaged_dfm <- dfm(tokens_most_engaged)
freqs_most_engaged <- sort(colSums(tokens_most_engaged_dfm), decreasing = TRUE)
# Most common words; the "right" emoji token is used often
head(freqs_most_engaged, 20)
## mehr neue schweiz schweizer right
## 81 67 48 47 46
## heut zeigt #hsluinformatik studi zhaw
## 44 41 40 39 39
## hes-so knnen neuen hochschul campus
## 38 38 36 34 33
## innov gibt ab entwickelt bfh
## 31 30 30 30 30
# Word cloud of the most engaged tweets' vocabulary (seeded for layout
# reproducibility)
set.seed(123)
engaged_cloud <- data.frame(
  word = names(freqs_most_engaged),
  freq = freqs_most_engaged
)
wordcloud2(engaged_cloud, size = 0.5)
# Per-university deep dive: top words, word cloud, and top-20 emojis.
for (uni in unique(tweets$university)) {
  # Tweets of this university in the four analysis languages
  uni_tweets <- tweets %>%
    filter(university == uni, lang %in% langs)
  # Same multilingual pipeline as above (default stemmer; redundant
  # tokens_ngrams(n = 1) removed — it returns tokens unchanged)
  tokens_uni <- uni_tweets %>%
    corpus(text_field = "full_text_emojis") %>%
    tokens(
      remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = TRUE,
      remove_url = TRUE, remove_separators = TRUE
    ) %>%
    tokens_tolower() %>%
    tokens_wordstem() %>%
    tokens_select(
      pattern =
        c(
          stopwords("en"), stopwords("de"),
          stopwords("fr"), stopwords("it"), extended_stopwords
        ), selection = "remove"
    )
  uni_dfm <- dfm(tokens_uni)
  freqs_uni <- sort(colSums(uni_dfm), decreasing = TRUE)
  # Bug fix: results inside a for loop are not auto-printed; the original
  # head(), wordcloud2() and ggplot() calls were silently discarded — wrap
  # them in print(). (The "right" emoji token is among the most common.)
  print(head(freqs_uni, 20))
  set.seed(123)
  print(wordcloud2(data.frame(
    word = names(freqs_uni),
    freq = freqs_uni
  ), size = 0.5))
  # Top emojis used by this university (tibbles shown in the output below)
  emoji_count_per_university <- uni_tweets %>%
    top_n_emojis(full_text)
  print(emoji_count_per_university)
  # Bar chart of the top emojis, most frequent at the top
  print(
    emoji_count_per_university %>%
      mutate(emoji_name = reorder(emoji_name, n)) %>%
      ggplot(aes(n, emoji_name)) +
      geom_col() +
      labs(x = "Count", y = NULL, title = "Top 20 Emojis Used")
  )
}
## # A tibble: 20 × 4
## emoji_name unicode emoji_category n
## <chr> <chr> <chr> <int>
## 1 backhand_index_pointing_right 👉 People & Body 56
## 2 yellow_heart 💛 Smileys & Emotion 34
## 3 black_heart 🖤 Smileys & Emotion 32
## 4 woman 👩 People & Body 28
## 5 man 👨 People & Body 17
## 6 clap 👏 People & Body 16
## 7 flag_Switzerland 🇨🇭 Flags 15
## 8 microscope 🔬 Objects 15
## 9 computer 💻 Objects 14
## 10 graduation_cap 🎓 Objects 13
## 11 school 🏫 Travel & Places 13
## 12 face_with_medical_mask 😷 Smileys & Emotion 12
## 13 raised_hands 🙌 People & Body 12
## 14 robot 🤖 Smileys & Emotion 12
## 15 female_sign ♀️ Symbols 10
## 16 trophy 🏆 Activities 9
## 17 woman_scientist 👩🔬 People & Body 9
## 18 party_popper 🎉 Activities 8
## 19 star_struck 🤩 Smileys & Emotion 8
## 20 sun_with_face 🌞 Travel & Places 8
## # A tibble: 20 × 4
## emoji_name unicode emoji_category n
## <chr> <chr> <chr> <int>
## 1 party_popper 🎉 Activities 18
## 2 grinning_face_with_big_eyes 😃 Smileys & Emotion 15
## 3 blush 😊 Smileys & Emotion 8
## 4 smiling_face_with_sunglasses 😎 Smileys & Emotion 8
## 5 bulb 💡 Objects 7
## 6 +1 👍 People & Body 6
## 7 camera_flash 📸 Objects 6
## 8 flexed_biceps 💪 People & Body 6
## 9 four_leaf_clover 🍀 Animals & Nature 6
## 10 grinning_face_with_smiling_eyes 😄 Smileys & Emotion 6
## 11 heart_eyes 😍 Smileys & Emotion 6
## 12 hugs 🤗 Smileys & Emotion 6
## 13 female_sign ♀️ Symbols 4
## 14 graduation_cap 🎓 Objects 4
## 15 grinning 😀 Smileys & Emotion 4
## 16 robot 🤖 Smileys & Emotion 4
## 17 backhand_index_pointing_down 👇 People & Body 3
## 18 computer 💻 Objects 3
## 19 lady_beetle 🐞 Animals & Nature 3
## 20 ocean 🌊 Travel & Places 3
## # A tibble: 20 × 4
## emoji_name unicode emoji_category n
## <chr> <chr> <chr> <int>
## 1 backhand_index_pointing_right 👉 People & Body 21
## 2 high_voltage ⚡ Travel & Places 11
## 3 wink 😉 Smileys & Emotion 9
## 4 clap 👏 People & Body 5
## 5 flag_Switzerland 🇨🇭 Flags 5
## 6 rocket 🚀 Travel & Places 5
## 7 +1 👍 People & Body 4
## 8 arrow_right ➡️ Symbols 4
## 9 bug 🐛 Animals & Nature 3
## 10 computer 💻 Objects 3
## 11 flexed_biceps 💪 People & Body 3
## 12 man 👨 People & Body 3
## 13 bangbang ‼️ Symbols 2
## 14 dark_skin_tone 🏿 Component 2
## 15 exclamation ❗ Symbols 2
## 16 female_sign ♀️ Symbols 2
## 17 four_leaf_clover 🍀 Animals & Nature 2
## 18 green_salad 🥗 Food & Drink 2
## 19 grinning 😀 Smileys & Emotion 2
## 20 medium_light_skin_tone 🏼 Component 2
## # A tibble: 20 × 4
## emoji_name unicode emoji_category n
## <chr> <chr> <chr> <int>
## 1 backhand_index_pointing_right 👉 People & Body 49
## 2 battery 🔋 Objects 16
## 3 backhand_index_pointing_down 👇 People & Body 12
## 4 woman 👩 People & Body 12
## 5 palm_tree 🌴 Animals & Nature 11
## 6 bulb 💡 Objects 10
## 7 computer 💻 Objects 10
## 8 evergreen_tree 🌲 Animals & Nature 10
## 9 graduation_cap 🎓 Objects 10
## 10 party_popper 🎉 Activities 10
## 11 robot 🤖 Smileys & Emotion 10
## 12 clap 👏 People & Body 9
## 13 coconut 🥥 Food & Drink 9
## 14 date 📅 Objects 9
## 15 deciduous_tree 🌳 Animals & Nature 9
## 16 flag_Switzerland 🇨🇭 Flags 9
## 17 rocket 🚀 Travel & Places 9
## 18 automobile 🚗 Travel & Places 8
## 19 clinking_glasses 🥂 Food & Drink 8
## 20 seedling 🌱 Animals & Nature 8
## # A tibble: 20 × 4
## emoji_name unicode emoji_category n
## <chr> <chr> <chr> <int>
## 1 arrow_right ➡️ Symbols 320
## 2 arrow_heading_down ⤵️ Symbols 245
## 3 book 📖 Objects 115
## 4 mag_right 🔎 Objects 97
## 5 mega 📣 Objects 53
## 6 clapper 🎬 Objects 38
## 7 NEW_button 🆕 Symbols 35
## 8 computer 💻 Objects 35
## 9 microscope 🔬 Objects 32
## 10 bulb 💡 Objects 29
## 11 police_car_light 🚨 Travel & Places 27
## 12 backhand_index_pointing_right 👉 People & Body 26
## 13 graduation_cap 🎓 Objects 23
## 14 studio_microphone 🎙️ Objects 23
## 15 clap 👏 People & Body 21
## 16 date 📅 Objects 17
## 17 medal_sports 🏅 Activities 15
## 18 memo 📝 Objects 15
## 19 woman 👩 People & Body 15
## 20 flag_Switzerland 🇨🇭 Flags 14
## # A tibble: 20 × 4
## emoji_name unicode emoji_category n
## <chr> <chr> <chr> <int>
## 1 sparkles ✨ Activities 28
## 2 flag_Switzerland 🇨🇭 Flags 18
## 3 rocket 🚀 Travel & Places 12
## 4 party_popper 🎉 Activities 11
## 5 partying_face 🥳 Smileys & Emotion 9
## 6 Christmas_tree 🎄 Activities 7
## 7 clap 👏 People & Body 7
## 8 star ⭐ Travel & Places 7
## 9 bottle_with_popping_cork 🍾 Food & Drink 6
## 10 bulb 💡 Objects 5
## 11 glowing_star 🌟 Travel & Places 5
## 12 smiling_face_with_sunglasses 😎 Smileys & Emotion 5
## 13 +1 👍 People & Body 4
## 14 camera_flash 📸 Objects 4
## 15 clinking_glasses 🥂 Food & Drink 4
## 16 four_leaf_clover 🍀 Animals & Nature 4
## 17 musical_notes 🎶 Objects 4
## 18 person_running 🏃 People & Body 4
## 19 raised_hands 🙌 People & Body 4
## 20 robot 🤖 Smileys & Emotion 4
## # A tibble: 20 × 4
## emoji_name unicode emoji_category n
## <chr> <chr> <chr> <int>
## 1 graduation_cap 🎓 Objects 3
## 2 man 👨 People & Body 2
## 3 man_student 👨🎓 People & Body 2
## 4 rocket 🚀 Travel & Places 2
## 5 snowflake ❄️ Travel & Places 2
## 6 backhand_index_pointing_right 👉 People & Body 1
## 7 brain 🧠 People & Body 1
## 8 chocolate_bar 🍫 Food & Drink 1
## 9 clapper 🎬 Objects 1
## 10 eyes 👀 People & Body 1
## 11 fire 🔥 Travel & Places 1
## 12 flexed_biceps 💪 People & Body 1
## 13 grinning 😀 Smileys & Emotion 1
## 14 heart_eyes_cat 😻 Smileys & Emotion 1
## 15 high_voltage ⚡ Travel & Places 1
## 16 mantelpiece_clock 🕰️ Travel & Places 1
## 17 sleeping 😴 Smileys & Emotion 1
## 18 slightly_smiling_face 🙂 Smileys & Emotion 1
## 19 sun ☀️ Travel & Places 1
## 20 woman 👩 People & Body 1
## # A tibble: 20 × 4
## emoji_name unicode emoji_category n
## <chr> <chr> <chr> <int>
## 1 arrow_right ➡️ Symbols 83
## 2 backhand_index_pointing_right 👉 People & Body 21
## 3 graduation_cap 🎓 Objects 19
## 4 arrow_forward ▶️ Symbols 18
## 5 bulb 💡 Objects 10
## 6 rocket 🚀 Travel & Places 9
## 7 party_popper 🎉 Activities 8
## 8 flag_Switzerland 🇨🇭 Flags 7
## 9 clap 👏 People & Body 6
## 10 exclamation ❗ Symbols 5
## 11 SOON_arrow 🔜 Symbols 4
## 12 grinning_face_with_big_eyes 😃 Smileys & Emotion 4
## 13 camera_flash 📸 Objects 3
## 14 computer 💻 Objects 3
## 15 movie_camera 🎥 Objects 3
## 16 rainbow 🌈 Travel & Places 3
## 17 studio_microphone 🎙️ Objects 3
## 18 woman 👩 People & Body 3
## 19 Christmas_tree 🎄 Activities 2
## 20 backhand_index_pointing_down 👇 People & Body 2
# General tokens over ALL tweets for the bigram and trigram analysis below.
# NOTE(review): unlike the per-language DFMs this uses the unfiltered
# `tweets` (all languages, including undetected ones) and stems with
# tokens_wordstem()'s default language despite the mixed-language input —
# confirm this is intended.
tokens <- tweets %>%
  corpus(text_field = "full_text_emojis") %>%
  tokens(
    remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = TRUE,
    remove_url = TRUE, remove_separators = TRUE
  ) %>%
  tokens_tolower() %>%
  tokens_wordstem() %>%
  # Remove all four languages' stopwords plus the shared extended list
  tokens_select(
    pattern =
      c(
        stopwords("en"), stopwords("de"),
        stopwords("fr"), stopwords("it"), extended_stopwords
      ), selection = "remove"
  )
# Bigram analysis: frequencies and word cloud
bi_gram_tokens <- tokens_ngrams(tokens, n = 2)
dfm_bi_gram <- dfm(bi_gram_tokens)
freqs_bi_gram <- colSums(dfm_bi_gram) %>% sort(decreasing = TRUE)
head(freqs_bi_gram, 20)
## right_arrow htw_chur index_point
## 421 259 207
## backhand_index hochschul_luzern point_right
## 206 185 183
## berner_fachhochschul sozial_arbeit prof_dr
## 157 154 142
## haut_cole herzlich_gratul open_book
## 141 139 117
## magnifi_glass glass_tilt tilt_right
## 97 97 97
## fh_graubnden neusten_blogbeitrag book_#revuehmisphr
## 91 87 85
## social_media advanc_studi
## 84 83
# Render the bigram word cloud (seeded for layout reproducibility)
set.seed(123)
bi_gram_cloud <- data.frame(
  word = names(freqs_bi_gram),
  freq = freqs_bi_gram
)
wordcloud2(bi_gram_cloud, size = 0.5)
# Trigram analysis: frequencies and word cloud.
# Fix: variable renamed from `reqs_tri_gram` (typo) to `freqs_tri_gram` for
# consistency with `freqs_bi_gram`; the word-cloud comment wrongly said
# "bigram".
tri_gram_tokens <- tokens_ngrams(tokens, n = 3)
dfm_tri_gram <- dfm(tri_gram_tokens)
freqs_tri_gram <- sort(colSums(dfm_tri_gram), decreasing = TRUE)
head(freqs_tri_gram, 20)
## backhand_index_point index_point_right
## 206 183
## magnifi_glass_tilt glass_tilt_right
## 97 97
## open_book_#revuehmisphr hochschul_gestaltung_kunst
## 85 62
## dipartimento_tecnologi_innov master_advanc_studi
## 40 38
## depart_sozial_arbeit #infoanlass_mrz_findet
## 36 33
## polic_car_light univers_appli_scienc
## 32 31
## busi_administr_statt findet_#zrich_infoanlass
## 30 30
## tag_offenen_tr hochschul_life_scienc
## 29 29
## gestaltung_kunst_fhnw mas_busi_administr
## 29 28
## mehr_neuen_blogbeitrag mehr_neusten_blogbeitrag
## 28 28
# Create the trigram word cloud (seeded for layout reproducibility)
set.seed(123)
wordcloud2(data.frame(
  word = names(freqs_tri_gram),
  freq = freqs_tri_gram
), size = 0.5)
# Drop documents that contain only zeros (empty after preprocessing);
# LDA() cannot handle empty rows. (Source: Christoph Zangger)
new_dfm <- dfm_subset(dfm_list$en, ntoken(dfm_list$en) > 0)
tweet_lda <- LDA(new_dfm, k = 5, control = list(seed = 123))
# Tidy the LDA results into a long term/topic/beta table
topic_terms <- tidy(tweet_lda, matrix = "beta")
# Extract topics and top terms
topics <- as.data.frame(terms(tweet_lda, 50)) # First fifty words per topic
# Extract top terms per topic
top_terms <- topic_terms %>%
  group_by(topic) %>%
  top_n(8, beta) %>% # Show top 8 terms per topic
  ungroup() %>%
  arrange(topic, -beta)
# Visualize top terms per topic
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~topic, scales = "free") +
  scale_y_reordered() +
  labs(
    x = "Beta (Term Importance within Topic)",
    y = NULL,
    title = "Top Terms per Topic in Tweets (LDA)"
  )
# Most different words among topics (using log ratios).
# Renamed from `diff` so the result no longer masks base::diff().
# Note: only topics 1-3 are filtered/compared; topics 4-5 stay as columns.
topic_diff <- topic_terms %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001 | topic3 > .001) %>%
  mutate(
    logratio_t1t2 = log2(topic2 / topic1),
    logratio_t1t3 = log2(topic3 / topic1),
    logratio_t2t3 = log2(topic3 / topic2)
  )
topic_diff
## # A tibble: 318 × 9
## term topic1 topic2 topic3 topic4 topic5 logratio_t1t2 logratio_t1t3
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 @academi… 7.84e-4 2.67e-3 3.06e-3 2.69e-3 1.60e-3 1.77 1.96
## 2 @bfh_hesb 3.20e-3 1.80e-3 4.26e-3 5.17e-3 9.55e-4 -0.832 0.414
## 3 @eth_en 4.35e-4 1.53e-3 1.51e-4 3.36e-4 4.54e-4 1.82 -1.52
## 4 @fh_grau… 1.52e-4 3.37e-4 1.82e-3 5.07e-4 1.75e-3 1.15 3.59
## 5 @fhnw 1.01e-2 5.26e-4 3.28e-3 3.29e-3 1.89e-3 -4.27 -1.63
## 6 @fhnwbusi 3.44e-3 2.31e-3 5.31e-3 3.71e-4 5.18e-3 -0.577 0.627
## 7 @globalc… 1.18e-3 5.89e-4 1.82e-4 4.27e-4 5.33e-4 -0.999 -2.70
## 8 @greater… 1.21e-4 6.38e-4 1.91e-3 1.92e-3 8.10e-4 2.40 3.98
## 9 @grstift… 2.02e-3 3.23e-3 7.09e-4 9.99e-4 1.77e-3 0.677 -1.51
## 10 @hes_so 1.38e-3 2.38e-3 1.69e-3 5.81e-4 1.86e-3 0.786 0.290
## # ℹ 308 more rows
## # ℹ 1 more variable: logratio_t2t3 <dbl>
# LDA topic modeling repeated separately for each university's tweets.
universities <- unique(tweets$university)
for (current_uni in universities) {
  # Restrict to this university's tweets and rebuild the token pipeline.
  current_tweets <- filter(tweets, university == current_uni)
  uni_tokens <- current_tweets %>%
    corpus(text_field = "full_text_emojis") %>%
    tokens(
      remove_punct = TRUE,
      remove_symbols = TRUE,
      remove_numbers = TRUE,
      remove_url = TRUE,
      remove_separators = TRUE
    ) %>%
    tokens_tolower() %>%
    tokens_wordstem() %>%
    tokens_ngrams(n = 1) %>%
    tokens_select(
      pattern = c(
        stopwords("en"), stopwords("de"),
        stopwords("fr"), stopwords("it"), extended_stopwords
      ),
      selection = "remove"
    )
  # Drop documents left empty by stopword removal, then fit a 5-topic LDA.
  uni_dfm <- dfm(uni_tokens)
  uni_dfm <- dfm_subset(uni_dfm, ntoken(uni_dfm) > 0)
  tweet_lda <- LDA(uni_dfm, k = 5, control = list(seed = 123))
  # Keep the eight highest-beta terms of every topic.
  tweet_lda_td <- tidy(tweet_lda)
  top_terms <- tweet_lda_td %>%
    group_by(topic) %>%
    top_n(8, beta) %>%
    ungroup() %>%
    arrange(topic, -beta)
  # Faceted bar chart of the leading terms per topic.
  p <- top_terms %>%
    mutate(term = reorder_within(term, beta, topic)) %>%
    ggplot(aes(beta, term, fill = factor(topic))) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~topic, scales = "free") +
    scale_y_reordered() +
    labs(
      x = "Beta (Term Importance within Topic)",
      y = NULL,
      title = paste("Top Terms per Topic in Tweets from", current_uni, "(LDA)")
    )
  print(p)
  # Console summary: the ten strongest terms per topic.
  cat("\nTopic Model Summary for", current_uni, ":\n")
  print(as.data.frame(terms(tweet_lda, 10)))
}
##
## Topic Model Summary for FHNW :
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
## 1 hochschul @fhnwbusi fhnw mehr @hsafhnw
## 2 @fhnwtechnik @fhnw mehr @fhnwbusi @fhnwbusi
## 3 @hsafhnw ab heut @fhnw @fhnw
## 4 neue mehr studierend challeng hochschul
## 5 fhnw interview projekt studierend neuen
## 6 campus @fhnwpsychologi dr projekt campus
## 7 mehr hochschul neue fhnw @fhnwpsychologi
## 8 gestaltung kinder @fhnwbusi @hsafhnw @fhnwtechnik
## 9 dass statt @fhnwpsychologi @fhnwtechnik mehr
## 10 kunst #campusolten backhand swiss schweiz
##
## Topic Model Summary for FH_Graubuenden :
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
## 1 chur htw #htwchur findet chur
## 2 htw mehr blogbeitrag chur statt
## 3 #graubnden #fhgr statt statt #htwchur
## 4 #htwchur statt #fhgr infoanlass blogbeitrag
## 5 blogbeitrag graubnden #infoanlass #htwchur findet
## 6 graubnden findet graubnden mehr thema
## 7 mehr neuen tourismus #infoanlass @suedostschweiz
## 8 #chur chur manag htw studierenden
## 9 #schweiz #studium findet busi oktob
## 10 heut #infoanlass studierend heut face
##
## Topic Model Summary for ZHAW :
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
## 1 zhaw @zhaw gibt @engineeringzhaw zhaw
## 2 heut #zhawimpact @zhaw schweiz @zhaw
## 3 #zhaw via @engineeringzhaw zeigt winterthur
## 4 dank zhaw cc #zhaw neuen
## 5 @zhaw mehr schweizer cc @sml_zhaw
## 6 @c_caviglia #zhaw neue neue knnen
## 7 via ab mehr studi @engineeringzhaw
## 8 cc cc heut #tonitag menschen
## 9 wdenswil neue studierenden dank neue
## 10 studi winterthur zhaw knnen mehr
##
## Topic Model Summary for bfh :
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
## 1 bfh thema bern bfh berner
## 2 mehr neue @bfh_hesb @bfh_hesb neue
## 3 bern statt fachhochschul bern thema
## 4 berner @hkb_bfh http berner bern
## 5 fachhochschul bern mehr mehr @bfh_hesb
## 6 biel neuen neue knnen bfh
## 7 heut mehr thema studi studi
## 8 entwickelt onlin knnen innen herzlich
## 9 anmelden zeigt neu #knoten_maschen schweizer
## 10 innen bfh neuen welch sozial
##
## Topic Model Summary for hes_so :
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
## 1 dan hes-so hes-so open right
## 2 arrow right right book hes-so
## 3 projet haut arrow haut arrow
## 4 plus arrow tudiant projet dan
## 5 scienc @hessovalai projet @hes_so projet
## 6 book domain magnifi cole @hes_so
## 7 right recherch cole dan #hes_so
## 8 dcouvrez tudiant plus #revuehmisphr master
## 9 tudiant cole haut master tilt
## 10 @hessovalai professeur #hes_so tudiant recherch
##
## Topic Model Summary for hslu :
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
## 1 luzern mehr luzern #hsluinformatik mehr
## 2 @hslu @hslu #hsluinformatik @hslu hochschul
## 3 mehr schweizer neue neue @hslu
## 4 depart depart depart knnen neuen
## 5 schweizer hochschul @hslu gibt zeigt
## 6 ab heut schweiz zeigt geht
## 7 kunst kunst menschen #hsluwirtschaft interview
## 8 studierenden design entwickelt hochschul studi
## 9 design jahr heut projekt jahr
## 10 arbeit studium geht welch arbeit
##
## Topic Model Summary for ost_fh :
## Topic 1 Topic 2
## 1 #ostschweizerfachhochschul ost
## 2 neue #ostschweizerfachhochschul
## 3 @ost_fh mehr
## 4 podcast ostschweiz
## 5 #podcast studium
## 6 kontrast fachhochschul
## 7 neuen menschen
## 8 kulturzyklus @ozg_ost
## 9 #kulturzyklus ab
## 10 zwei detail
## Topic 3 Topic 4
## 1 @ost_fh @ost_fh
## 2 @ozg_ost #ostschweizerfachhochschul
## 3 st.gallen ost
## 4 #ostschweizerfachhochschul @ost_wi
## 5 mehr rapperswil
## 6 ost @ozg_ost
## 7 @eastdigit drei
## 8 #informatik septemb
## 9 neu #ost
## 10 spannend campus
## Topic 5
## 1 @ost_fh
## 2 #ostschweizerfachhochschul
## 3 mehr
## 4 ost
## 5 prof
## 6 @insrapperswil
## 7 herzlich
## 8 projekt
## 9 st.gallen
## 10 ostschweiz
##
## Topic Model Summary for supsi_ch :
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
## 1 #supsiev informazioni supsi iscrizioni supsi
## 2 @supsi_ch bachelor #supsiev right formazion
## 3 lavoro info #supsinew #supsiev pi
## 4 info oggi manag innov nuovo
## 5 nuovo progetto formazion ticino #supsinew
## 6 studenti @usi_univers settembr info progetto
## 7 #supsinew studi master oggi social
## 8 @usi_univers apert presentazion master apert
## 9 supsi scopri tecnologi arrow ricerca
## 10 progetto studenti dipartimento conferenza corsi
# Histogram of tweet lengths, measured in characters of the raw text.
tweet_lengths <- mutate(tweets, tweet_length = nchar(full_text))
ggplot(tweet_lengths, aes(x = tweet_length)) +
  geom_histogram() +
  labs(title = "Distribution of Tweet Lengths")
### Sentiment Analysis
# Sentiment is only computed for the languages listed here; all other
# tweets are excluded before scoring.
langs <- c("de", "fr", "it", "en")
tweets_filtered <- filter(tweets, lang %in% langs)
# Create Function to Get Syuzhet Sentiment
#
# Scores a single text with syuzhet's sentiment lexicon, returning NA_real_
# for tweets whose language is not in `supported`.
#
# text      character scalar to score.
# lang      two-letter language code of the tweet.
# supported language codes eligible for scoring; the default equals the
#           script-level `langs` vector, so existing two-argument calls
#           behave exactly as before.
#
# NOTE(review): syuzhet's "syuzhet" method uses an English-only lexicon and
# the `language` argument affects other methods (e.g. "nrc") — confirm that
# scoring non-English text this way is intended.
get_syuzhet_sentiment <- function(text, lang,
                                  supported = c("de", "fr", "it", "en")) {
  if (lang %in% supported) {
    # `language` is spelled out in full; the original `lang =` relied on
    # R's partial argument matching against get_sentiment()'s `language`.
    get_sentiment(text, method = "syuzhet", language = lang)
  } else {
    # Typed NA keeps mapply()'s combined result numeric.
    NA_real_
  }
}
# Score every filtered tweet with its language-specific syuzhet sentiment.
tweets_filtered$sentiment <- mapply(
  get_syuzhet_sentiment,
  tweets_filtered$full_text,
  tweets_filtered$lang
)
# Monthly mean sentiment per university.
# NOTE(review): `month` is assumed to be a datetime column created earlier
# in the script — confirm.
plot_data <- tweets_filtered %>%
  group_by(university, month) %>%
  summarize(mean_sentiment_syuzhet = mean(sentiment, na.rm = TRUE))
# One line per university across the monthly means.
ggplot(
  plot_data,
  aes(
    x = month,
    y = mean_sentiment_syuzhet,
    color = university,
    group = university
  )
) +
  geom_line() +
  labs(
    title = "Mean Syuzhet Sentiment Over Time by University",
    y = "Mean Sentiment Score"
  ) +
  scale_x_datetime(date_breaks = "1 month", date_labels = "%Y-%m") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Per-university sentiment over time, plus word clouds of positive and
# negative words (German tweets only).
for (uni in unique(tweets$university)) {
  uni_tweets <- tweets %>%
    filter(university == uni, lang %in% langs)
  # Score each tweet; unsupported languages yield NA.
  uni_tweets$sentiment <-
    mapply(get_syuzhet_sentiment, uni_tweets$full_text, uni_tweets$lang)
  # Mean sentiment per (year, month).
  # NOTE(review): `year` and `month` are assumed to be columns created
  # earlier in the script — confirm.
  plot_data <- uni_tweets %>%
    group_by(year, month) %>%
    summarize(mean_sentiment = mean(sentiment, na.rm = TRUE))
  # Plot Syuzhet sentiment over time for this university, faceted by year.
  print(ggplot(plot_data, aes(x = month, y = mean_sentiment)) +
    geom_line(aes(color = as.factor(year))) +
    labs(
      title = paste0("Mean Syuzhet Sentiment Over Time by - ", uni),
      y = "Mean Sentiment Score"
    ) +
    scale_x_datetime(date_breaks = "1 month", date_labels = "%Y-%m") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    facet_wrap(~year, scales = "free_x"))
  # No per-language word scoring was found, so word-level sentiment uses
  # the full_text_emojis column and only the German tweets.
  uni_tweets_de <- uni_tweets %>%
    filter(lang == "de")
  # Tokenize and preprocess words.
  uni_words <- uni_tweets_de %>%
    unnest_tokens(word, full_text_emojis) %>%
    anti_join(get_stopwords(language = "de")) %>%
    distinct() %>% # remove duplicated words
    filter(nchar(word) > 3) %>% # remove words with less than 4 characters
    filter(!str_detect(word, "\\d")) # remove numbers
  # Attach a syuzhet score to each remaining word.
  # NOTE(review): `lang = "de"` relies on partial matching against
  # get_sentiment()'s `language` argument, and the syuzhet lexicon is
  # English-only — confirm this is intended.
  sentiment_words <- uni_words %>%
    mutate(
      sentiment = get_sentiment(word, method = "syuzhet", lang = "de")
    )
  # Split into positive and negative words. Note that a score of exactly 0
  # (neutral) ends up on the positive side.
  positive_words <- sentiment_words %>%
    filter(sentiment >= 0) %>%
    count(word, sort = TRUE) %>%
    rename(freq = n) # Rename 'n' to 'freq'
  negative_words <- sentiment_words %>%
    filter(sentiment < 0) %>%
    count(word, sort = TRUE) %>%
    rename(freq = n) # Rename 'n' to 'freq'
  # Word cloud of positive words.
  wordcloud2(data.frame(
    word = positive_words$word,
    freq = positive_words$freq
  ), size = 0.5)
  # Word cloud of negative words.
  wordcloud2(data.frame(
    word = negative_words$word,
    freq = negative_words$freq
  ), size = 0.5)
}
# Language Analysis
# Tweet counts per university and language, most frequent first.
# NOTE(review): the recorded output below is a 127-row tibble grouped by
# university, which a plain `count(lang)` cannot produce; the code is
# aligned with the transcript by counting within each university.
tweets %>%
  group_by(university) %>%
  count(lang) %>%
  arrange(desc(n))
## # A tibble: 127 × 3
## # Groups: university [8]
## university lang n
## <chr> <chr> <int>
## 1 hslu de 2902
## 2 bfh de 2760
## 3 ZHAW de 2712
## 4 FHNW de 2353
## 5 FH_Graubuenden de 2245
## 6 supsi_ch it 1786
## 7 hes_so fr 1663
## 8 FH_Graubuenden <NA> 374
## 9 ost_fh de 248
## 10 ZHAW <NA> 225
## # ℹ 117 more rows
# Emoji Analysis
# Count emoji usage with the project helper, then chart the most-used ones
# sorted by frequency.
emoji_count <- top_n_emojis(tweets, full_text)
ggplot(emoji_count, aes(n, reorder(emoji_name, n))) +
  geom_col() +
  labs(x = "Count", y = NULL, title = "Top 20 Emojis Used")
# Collect headline results from earlier sections into one named list.
# NOTE(review): hours_with_most_tweets_by_uni, days_with_most_tweets_by_uni
# and words_freqs_de are assumed to be defined earlier in the script —
# confirm before running this chunk in isolation.
insights <- list(
  "Most Active Hours" = hours_with_most_tweets_by_uni,
  "Most Active Days" = days_with_most_tweets_by_uni,
  "Content Analysis" = head(words_freqs_de),
  "Sentiment Analysis" = head(tweets_filtered$sentiment)
)